/*******************************************************************************
Updated: February 22 2023
********************************************************************************/

cls
clear all
local dir "~\Dropbox\Working Papers\Distinction Effect"
cd "`dir'"

qui do "Does\Data Processing\0. Cleaning Programs.do"

/*******************************************************************************
					1. Universe : 2006 to 2010
********************************************************************************/

tempfile TestTakersUniverse
foreach t in 20061 20062 20071 20072 20081 20082 20092 20102 20103 {
	
	//local t = 20092
	import delimited "Data\Originals\Saber Pro\Saber Pro `t'.txt", encoding("utf-8")  stringc(_all) clear
	*keep in 1/5000
	
	//IDs
	rename estu_consecutivo consecutivo
	rename inpe_primernombre spro_name1
	rename inpe_segundonombre spro_name2
	rename inpe_primerapellido spro_surname1
	rename inpe_segundoapellido spro_surname2
	rename eval_documento national_id
	rename eval_tipodocumento national_id_type
	rename inst_cod_institucion spro_cllgcode
	cap rename inst_pertenece_evaluado spro_cllgname
	cap rename inst_nombre_institucion spro_cllgname
	rename estu_prgm_academico_cod spro_programcode
	rename inst_prac_consecutivosnies spro_sniescode
	
	//Characteristics
	rename estu_genero spro_gender
	rename estu_semestre_cursando spro_semester
	rename estu_estrato spro_stratum
	rename estu_estado_civil spro_maritalstatus
	rename estu_reside_codmpio spro_munireside
	rename estu_reside_mpio spro_municipio
	rename estu_reside_dept spro_departamento	
	rename fami_cod_educa_madre spro_educmother
	rename fami_cod_ocup_madre spro_occupmother
	rename fami_cod_educa_padre spro_educfather
	rename fami_cod_ocup_padre spro_occupfather
	rename estu_zona spro_munizone
	gen spro_birthdate = date(estu_nacimiento_dia + "/" + estu_nacimiento_mes + "/" + estu_nacimiento_anno, "DMY")
	format spro_birthdate %td
	cap rename estu_trabaja spro_work
	cap rename estu_horas_trabajo spro_hrswork
	cap rename inst_vlr_matricula_ant spro_cllgtuition
	
	//High School Exit Exam
	rename estu_exam_anno_presentacion  spro_sb11year
	rename estu_exam_semestre_prestacion spro_sb11sem
	
	//Test Scores
	rename *_ingles_punt spro_english
	rename *_ingles_desem spro_english_per
	rename *_comp*_lect*_punt spro_reading
	rename *_comp*_lect*_desem spro_reading_per
	cap rename comp_comunica_escri_punt spro_writing
	cap rename comp_comunica_escri_desem spro_writing_per
	cap rename comp_enten_interper_punt spro_persunderstand
	cap rename comp_pensa_criti_punt spro_criticalthink
	cap rename comp_sol_problemas_punt spro_problemsolve
	
	//Exam Periods
	gen exam_time = "`t'"
	gen exam_year = substr(exam_time, 1, 4)
	gen exam_semester = substr(exam_time, -1, 1)
	
	//Keep Important Variables
	keep exam_* consecutivo national* spro_* 	
	
	//Append Years
	cap append using `TestTakersUniverse'
	save `TestTakersUniverse', replace	
}

//Cleaning 

drop exam_nombre
destring spro_* exam_*, replace

replace spro_gender = upper(spro_gender)
rename spro_gender aux
gen spro_gender = (aux == "F") if aux != ""
lab define gender 1 "Female" 0 "Male"
lab values spro_gender gender
drop aux
	
global Tests spro_english spro_reading spro_writing spro_persunderstand spro_criticalthink spro_problemsolve
destring $Tests, replace dpcomma
foreach score in $Tests {
	replace `score' = . if `score' < 0
}
	
replace spro_english_per = trim(itrim(spro_english_per))	
foreach var of varlist spro_*_per {
	replace `var' = "" if `var' == "-1" | `var' == "NA"
}

save `TestTakersUniverse', replace
	
/*******************************************************************************
					2. Field-Specific Test Scores
*******************************************************************************/

tempfile SpecificTestScores
foreach t in 20061 20062 20071 20072 20081 20082 20092 {
	
	import delimited "Data\Originals\Saber Pro\ESP_`t'.txt", delim("|") encoding("utf-8") clear
	*keep in 1/1000
	
	//Prueba ID
	//Keep only scores from a student main Exam
	bys estu_consecutivo prue_id : egen aux = seq()
	bys estu_consecutivo : egen aux2 = max(aux)
	gen aux3 = prue_id if aux == aux2
	bys estu_consecutivo : egen aux4 = min(aux3) 
	keep if prue_id == aux4 | prue_id == 0
	drop prue_id 
	rename aux4 prue_id
	drop aux*
	
	//ID Prueba
	egen id = group(prue_id comp_id)
	bys prue_id : egen aux = min(id) if comp_id != 0
	replace id = id - aux + 1
	replace id = 0 if comp_id == 0
	drop aux
		
	rename caco_puntaje spro_score
	rename caco_desempeno spro_levelper
	destring spro_score, replace dpcomma

	labmask comp_id, values(comp_nombre)		
	drop comp_nombre
	
	reshape wide comp_id spro_score spro_levelper, j(id) i(estu_consecutivo)
	
	rename estu_consecutivo consecutivo
		
	cap rename caco_periodo exam_time
	cap gen exam_time = 20092
	
	cap append using `SpecificTestScores'
	save `SpecificTestScores', replace
}

rename prue_id exam_id
rename prue_nombre exam_name
rename spro_score0 spro_overallscore
drop spro_levelper0

order consecutivo exam_time exam_name exam_id

spell exam_name, clean(exam_name)

save `SpecificTestScores', replace

/*******************************************************************************
							3. Awardees
********************************************************************************/

tempfile Awardees
global T 2006-1 2006-2 2007-1 2007-2 2008-1 2008-2 2009-2
foreach t in $T {
	import excel "Data\Originals\Awardees\Mejores Saber pro `t'.xlsx", first clear case(lower)
	
	gen exam_time = subinstr("`t'", "-", "", .)
	destring exam_time, replace
	
	replace puntaje = subinstr(puntaje, ".", ",", .)
	destring puntaje, replace dpcomma
	rename inst institucion
	
	cap append using `Awardees'
	save `Awardees', replace
}

rename nombre name
rename institucion spro_cllgname

global C name spro_cllgname departamento municipio examen
spell $C, clean($C)

drop if examen == "ENTENDIMIENTO INTERPERSONAL"
drop if examen == "PENSAMIENTO CRITICO"
drop if examen == "SOLUCION DE PROBLEMAS"
drop if examen == "INGLÉS"

gen id_master = _n

save `Awardees', replace

/*******************************************************************************
				4. Phonetic Merge Awardees to Universe 
********************************************************************************/
use `TestTakersUniverse', clear
merge 1:1 consecutivo using `SpecificTestScores'
keep if _merge == 3 

//Merge Variables
gen name = spro_surname1 + " " + spro_surname2 + " " + spro_name1 + " " + spro_name2

keep consecutivo name spro_cllgname exam_time exam_name spro_municipio spro_departamento spro_overallscore

spell name spro_cllgname spro_municipio spro_departamento, clean(name spro_cllgname)
replace name = upper(name)
replace spro_cllgname = upper(spro_cllgname)

//Keep only 85th Percentile
egen p_overall = xtile(spro_overallscore), by(exam_time) nq(100)
keep if p_overall >= 85 & spro_overallscore != .

gen id_using = _n

tempfile SPro85thPercentile
save `SPro85thPercentile', replace

//Phonetic/Fuzzy Merge
use `Awardees', clear
reclink name spro_cllgname exam_time using `SPro85thPercentile', idm(id_master) idu(id_using) require(exam_time) gen(phone_score)
keep if _merge == 3

//Compare Merge Using Different Methods
matchit name Uname, sim(bigram) gen(phone_score1)
matchit examen exam_name, sim(bigram) g(phone_score2)
matchit spro_cllgname Uspro_cllgname, sim(soundex) g(phone_score3)
replace phone_score2 = 1 if regexm(examen, "SUPERIOR") & regexm(exam_name, "SUPERIOR")
replace phone_score3 = 1 if regexm(spro_cllgname, "SUPERIOR") & regexm(Uspro_cllgname, "SUPERIOR")
replace phone_score3 = 1 if regexm(spro_cllgname, "NORMAL") & regexm(Uspro_cllgname, "NORMAL")
gen phone_score4 = phone_score1*phone_score2

//Drop Duplicates
bys id_using : egen aux = max(phone_score4)
keep if phone_score4 == aux
drop aux

break
dis in red "Contact authors for details"

rename phone_score1 merge_score
order consecutivo name merge_score
keep consecutivo merge_score id_master 

merge 1:1 id_master using `Awardees'
keep if _merge == 3 | _merge == 2
drop _merge

save `Awardees', replace

/*******************************************************************************
			5. Merge Universe + Field-Specific Scores + Awardees
********************************************************************************/
use `TestTakersUniverse', clear
drop if consecutivo == ""

//Merge Field-Specific Scores
merge 1:1 consecutivo using `SpecificTestScores'
gen merge_specific_test = (_merge == 3)
keep if _merge == 3 | _merge == 1
drop _merge

//Merge Awardees
merge 1:m consecutivo using `Awardees', keepusing(puntaje)
gen merge_specific_award = (_merge == 3)
keep if _merge == 3 | _merge == 1
drop _merge puntaje

save "Data\Intermediates\SaberPro_2006-2010.dta", replace

//Cleaning
use "Data\Intermediates\SaberPro_2006-2010.dta", clear

egen group = group(exam_id exam_time)

egen double cutoff_aux = min(spro_overallscore) if merge_specific_award == 1, by(group)
egen double cutoff = min(cutoff_aux), by(group)
gen double score = spro_overallscore - cutoff
drop cutoff_aux

tab exam_name exam_time if merge_specific_award==0 & score >= 0 & exam_time <= 20092

gen aux = 0 if exam_time <= 20092 & merge_specific_test == 1
replace aux = 1 if exam_id == 9 //Architecture
replace aux = 1 if exam_id == 317 //Sports and PE
replace aux = 1 if exam_id == 33 //Humanities and Spanish Education
replace aux = 1 if exam_id == 25 & exam_time == 20072 //Psychology 
replace aux = 1 if exam_id == 148 & exam_time == 20092 //Geology
replace aux = 1 if exam_id == 42 & (exam_time == 20081 | exam_time == 20092) //Modern Languages English
replace aux = 1 if exam_id == 43 & (exam_time == 20061 | exam_time == 20071) //Modern Languages French
replace aux = 1 if exam_id == 288 & exam_time == 20061 //Pedagogy Diploma (Normalista)

tab exam_name exam_time if merge_specific_award==0 & score >= 0 & aux == 0 //Still 35 students missclasified

tab exam_name exam_time if merge_specific_award==1 & score < 0 & aux == 0

cap drop spro_specific_award
gen spro_specific_award = 1 if score >= 0 & aux == 0 //35 obs
replace spro_specific_award = 0 if  score < 0 & aux == 0
drop aux score cutoff*

egen double cutoff_aux = min(spro_overallscore) if spro_specific_award == 1, by(group)
egen double cutoff = min(cutoff_aux), by(group)
gen double score = spro_overallscore - cutoff

tab exam_name exam_time if spro_specific_award==0 & score >= 0

replace score = round(score, 0.01) if score != .
replace score = score + 0.15 if score < -0.22 & score != .
replace score = -0.21 if score < 0 & score > -0.1 & score != .
replace score = score - 0.10 if score > 0 & score != .
replace score = 0.31 if score > 0 & score < 0.3 & score != .
replace score = round(score, 0.05) if score != . & abs(score) < 1

cap drop spro_puntaje
gen double spro_puntaje = score + cutoff
replace spro_puntaje = spro_overallscore if spro_puntaje == .
order spro_overallscore spro_puntaje, after(comp_id0)
drop score cutoff*
drop group

save "Data\Intermediates\SaberPro_2006-2010.dta", replace

/*******************************************************************************
				6. SNIES Program Codes
********************************************************************************/
import excel using "Data\Originals\SNIES\Estudiantes matriculados hasta 2013.xlsx", firstrow case(lower) allstring  cellrange(A7) clear

rename c*digodelainstituci*n spro_cllgcode
rename c*digosniesdelprograma spro_sniescode
rename sectories cllg_prv
rename niveldeformaci*n program_level
rename metodologíadelprograma program_delvrymthd
rename *readeconocimiento program_area
rename n*cleob*sicodelconocimiento program_field

//Keep 2/3-year and 4/5-year programs
keep if program_level == "TECNICA PROFESIONAL" | program_level == "TECNOLOGICA" | program_level == "UNIVERSITARIA"
destring spro_cllgcode spro_sniescode, replace force
drop if spro_cllgcode == .
drop if spro_sniescode == .

//Drop duplicates
duplicates drop spro_cllgcode spro_sniescode, force

keep spro_* cllg_* program_*

//Encode 
foreach var of varlist program_level program_delvrymthd program_area {
    rename `var' aux
	encode aux, gen(`var')
	drop aux
}
rename cllg_prv aux
gen cllg_prv = (aux == "PRIVADA")
lab define prv 1 "PRIVADA" 0 "PUBLICA"
lab value cllg_prv prv
drop aux

//Save
save "Data\Intermediates\SNIESPrograms", replace

//Merge to Universe
use "Data\Intermediates\SaberPro_2006-2010.dta", clear

cap drop program_field
merge m:1 spro_cllgcode spro_sniescode using "Data\Intermediates\SNIESPrograms"
keep if _merge == 1 | _merge == 3
drop _merge

order program_level-program_field, after(spro_sniescode)
order cllg_prv, after(spro_cllgcode)

replace spro_cllgname = trim(itrim(spro_cllgname))

compress
save "Data\Intermediates\SaberPro_2006-2010.dta", replace
